I chose the Lime package (GitHub Repository) and followed the tutorial for continuous tabular data from the official library documentation (Tutorial - Continuous and Categorical Features).
From the first assignment, I obtained the SpeedDating imbalanced dataset with the following features:
This dataset is interesting because the most important feature is expected to be `like`, but the model may surface other insights as well.
I retrieved models from the first Homework: Logistic Regression and XGBoost.
The Lime explanations for positive and negative instances from the test set are below.
# Show positive examples and their explanations for logistic regression.
# (Indentation of the loop bodies restored — as pasted, the flush-left
# bodies were a SyntaxError.)
for positive_example in positive_examples:
    exp = explainer.explain_instance(positive_example, clf_logistic_regression.predict_proba, num_features=10)
    exp.show_in_notebook(show_table=True, show_all=False)

# Show negative examples and their explanations for logistic regression.
for negative_example in negative_examples:
    exp = explainer.explain_instance(negative_example, clf_logistic_regression.predict_proba, num_features=10)
    exp.show_in_notebook(show_table=True, show_all=False)
The results do not look stable: the most important factors generally differ from one example to the next.
# Show positive examples and their explanations for XGBoost.
# (Indentation of the loop bodies restored — as pasted, the flush-left
# bodies were a SyntaxError.)
for positive_example in positive_examples:
    exp = explainer.explain_instance(positive_example, clf_xgboost.predict_proba, num_features=10)
    exp.show_in_notebook(show_table=True, show_all=False)

# Show negative examples and their explanations for XGBoost.
for negative_example in negative_examples:
    exp = explainer.explain_instance(negative_example, clf_xgboost.predict_proba, num_features=10)
    exp.show_in_notebook(show_table=True, show_all=False)
We can observe that the results for XGBoost are superior to those of logistic regression. For instance, the like feature is consistently one of the most important in all explanations for XGBoost, which is not the case for logistic regression. The shared_interests_o feature is systematically the second most important feature across the XGBoost explanations, again unlike logistic regression. On the positive side, both like and shared_interests_o are intuitively features that should matter.
import os

# Walk up out of any 'Homeworks' subdirectory so that relative paths
# (e.g. the downloaded CSV) resolve from the repository root.
# (Indentation of the loop body restored — flush-left it was a SyntaxError.)
while 'Homeworks' in os.getcwd():
    os.chdir('..')
# pip install pandas matplotlib scikit-learn xgboost plotly lime
import requests
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
import lime
import random
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
# Download the SpeedDating imbalanced-benchmark dataset to the working
# directory. (Indentation of the if/with/else bodies restored — flush-left
# they were a SyntaxError.)
url = 'https://raw.githubusercontent.com/adrianstando/imbalanced-benchmarking-set/main/datasets/SpeedDating.csv'
response = requests.get(url)
if response.status_code == 200:
    with open('SpeedDating.csv', 'wb') as f:
        f.write(response.content)
else:
    print(f"Failed to download the file. Status code: {response.status_code}")
path = "SpeedDating.csv"
# first column is the index
df = pd.read_csv(path, index_col=0)
df
# train test split
# NOTE(review): test_size=0.8 keeps only 20% of the rows for TRAINING —
# the conventional choice is test_size=0.2; confirm this is intentional.
# NOTE(review): no random_state is set, so the split (and every downstream
# LIME explanation) is not reproducible across runs — confirm acceptable.
train, test = train_test_split(df, test_size=0.8)
X_train, y_train = train.drop(columns=['TARGET']), train['TARGET']
X_test, y_test = test.drop(columns=['TARGET']), test['TARGET']
len(X_train), len(X_test)
# Train and evaluate logistic regression from sklearn
from sklearn.linear_model import LogisticRegression

# NOTE(review): features are fed unscaled, so the default lbfgs solver may
# stop at max_iter without converging — confirm no ConvergenceWarning.
clf_logistic_regression = LogisticRegression()
clf_logistic_regression.fit(X_train.values, y_train.values)
# Use .values here too: fitting on a plain ndarray but predicting on a
# DataFrame is inconsistent and makes sklearn emit feature-name warnings.
y_pred_logistic_regression = clf_logistic_regression.predict_proba(X_test.values)[:, 1]

# Train and evaluate XGBoost (fit and predict both on DataFrames — consistent).
clf_xgboost = xgb.XGBClassifier()
clf_xgboost.fit(X_train, y_train)
y_pred_xgboost = clf_xgboost.predict_proba(X_test)[:, 1]
from lime import lime_tabular

# LIME explainer built on the training distribution; continuous features are
# discretized so the explanation rules read as interpretable bins.
explainer = lime_tabular.LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['0', '1'], discretize_continuous=True)

# Split the test set by class so we can explain both outcomes.
X_test_pos = X_test[y_test == 1]
y_test_pos = y_test[y_test == 1]
X_test_neg = X_test[y_test == 0]
y_test_neg = y_test[y_test == 0]

# Draw 5 random rows of each class (with replacement).
# BUGFIX: random.randint(0, n) is INCLUSIVE of n, so the original code could
# index one past the end and raise IndexError; randrange(n) yields 0..n-1.
positive_examples = [X_test_pos.values[random.randrange(len(X_test_pos))] for _ in range(5)]
negative_examples = [X_test_neg.values[random.randrange(len(X_test_neg))] for _ in range(5)]
# Show positive examples and their explanations for logistic regression.
# (Indentation of the loop body restored — flush-left it was a SyntaxError.)
for positive_example in positive_examples:
    print("LogisticRegression")
    exp = explainer.explain_instance(positive_example, clf_logistic_regression.predict_proba, num_features=10)
    exp.show_in_notebook(show_table=True, show_all=False)